We transform the data to reflect the desired structure for modeling.

diabetes_full <- read.csv("/Users/benabijah/Downloads/diabetes_cohort.csv", header = T) %>%
  mutate(BENE_ID = as.character(BENE_ID)) %>%
  select(-c(MONTH, YR))

# define the list of other diagnoses
diab_names <- c("Prediabetes","Diabetes mellitus without complication","Diabetes mellitus with complication")

other_diagnoses <- unique(diabetes_full$CCSR_CATEGORY_DESCRIPTION[
  !(diabetes_full$CCSR_CATEGORY_DESCRIPTION %in% diab_names)])
# get data for all non-diabetes diagnoses
other_diagnoses_data <- diabetes_full %>%
  filter(CCSR_CATEGORY_DESCRIPTION %in% other_diagnoses) %>%
  mutate(status = 1) %>%
  select(-STATE) %>%
  pivot_wider(names_from = CCSR_CATEGORY_DESCRIPTION, values_from = status)

other_diagnoses_data
# get data for all diabetes diagnoses
diabetes_only_data <- diabetes_full %>%
  filter(CCSR_CATEGORY_DESCRIPTION %in% diab_names) %>%
  mutate(STATE = case_when(
    CCSR_CATEGORY_DESCRIPTION == "Prediabetes" ~ 1,
    CCSR_CATEGORY_DESCRIPTION == "Diabetes mellitus without complication" ~ 2,
    CCSR_CATEGORY_DESCRIPTION == "Diabetes mellitus with complication" ~ 3)) %>%
  select(-CCSR_CATEGORY_DESCRIPTION) %>%
  arrange(desc(BENE_ID), TIME_SINCE_PREDIAB) 

diabetes_only_data
diabetes_transformed <- full_join(
  diabetes_only_data, other_diagnoses_data, by = c("BENE_ID","TIME_SINCE_PREDIAB")) %>%
  mutate(STATE = ifelse(is.na(STATE), 0, STATE)) %>%
  arrange(desc(BENE_ID), TIME_SINCE_PREDIAB)

diabetes_transformed
# now let's update the STATE column...
update_state_iteratively <- function(data) {
  repeat {
    # store previous state values 
    prev_state <- data$STATE
    
    # update STATE iteratively for second to last rows
    data <- data %>%
      group_by(BENE_ID) %>%
      mutate(STATE = ifelse(row_number() > 1 & STATE == 0, lag(STATE), STATE)) %>%
      ungroup()
    
    # break the loop if no changes were made
    if (all(prev_state == data$STATE)) break
  }
  return(data)
}

diabetes_transformed <- update_state_iteratively(diabetes_transformed)
diabetes_transformed
# update comorbodities
update_diagnoses_iteratively <- function(data, cols) {
  data <- data %>%
    group_by(BENE_ID) %>%
    mutate(across(all_of(cols), ~ ifelse(row_number() == 1 & is.na(.x), 0, .x))) %>% # set first row to 0, if NA
    ungroup()
  
  repeat {
    prev_data <- data[cols]  #store previous state for checking convergence
    
    # perform iterative update until all NAs are filled
    data <- data %>%
      group_by(BENE_ID) %>%
      mutate(across(all_of(cols), ~ ifelse(is.na(.x), lag(.x, default = 0), .x))) %>%
      ungroup()
    
    # break loop if no changes are made
    if (identical(prev_data, data[cols])) break
  }
  
  return(data)
}

diabetes_transformed <- update_diagnoses_iteratively(diabetes_transformed, other_diagnoses)

# filter out STATE=0
diabetes_transformed <- diabetes_transformed %>% filter(STATE != 0)
diabetes_transformed
# merge with data from enrollment file
enrollment_diab <- read.csv("/Users/benabijah/Downloads/enrollment_cohort.csv", header = T) %>%
  mutate(BENE_ID = as.character(BENE_ID))

diabetes_transformed <- left_join(diabetes_transformed, enrollment_diab, by = c("BENE_ID"))
diabetes_transformed
write.csv(diabetes_transformed, "~/Library/CloudStorage/Dropbox/Projects/EHR/diab_cohort.csv", row.names = FALSE)
saveRDS(diabetes_transformed, file="~/Library/CloudStorage/Dropbox/Projects/EHR/diab_cohort.rds")